imdb_ratings explicit from IMDb (for obvious
reasons)
dim(simpsons)
## [1] 725 12
names(simpsons)
## [1] "id" "title" "description"
## [4] "original_air_date" "production_code" "directed_by"
## [7] "written_by" "season" "number_in_season"
## [10] "number_in_series" "us_viewers_in_millions" "imdb_rating"
summary(simpsons)
## id title description original_air_date
## Min. : 0 Length:725 Length:725 Min. :1989-12-17
## 1st Qu.:181 Class :character Class :character 1st Qu.:1997-10-26
## Median :362 Mode :character Mode :character Median :2005-11-27
## Mean :362 Mean :2006-01-04
## 3rd Qu.:543 3rd Qu.:2014-03-16
## Max. :724 Max. :2022-05-22
## production_code directed_by written_by season
## Length:725 Length:725 Length:725 Min. : 1.00
## Class :character Class :character Class :character 1st Qu.: 9.00
## Mode :character Mode :character Mode :character Median :17.00
## Mean :16.94
## 3rd Qu.:25.00
## Max. :33.00
## number_in_season number_in_series us_viewers_in_millions imdb_rating
## Min. : 1.00 Min. : 1 Length:725 Min. :4.000
## 1st Qu.: 6.00 1st Qu.: 182 Class :character 1st Qu.:6.600
## Median : 12.00 Median : 363 Mode :character Median :7.100
## Mean : 15.98 Mean : 3123 Mean :7.166
## 3rd Qu.: 17.00 3rd Qu.: 544 3rd Qu.:7.700
## Max. :1920.00 Max. :712713 Max. :9.300
sum(is.na(simpsons))
## [1] 0
head(simpsons) %>% rmarkdown::paged_table()
summary(simpsons$length_description)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 5.00 18.00 23.00 23.92 28.00 98.00
imdb_rating to rating
us_viewers_in_millions to viewers
length_description and age (time since
original air date)
age analysis is conducted will be
based off this date as newer seasons start
around this time
simpsons$description[which(simpsons$length_description == 5)]
## [1] "Marge becomes a real-estate agent."
## [2] "Marge accidentally gets breast implants."
## [3] "Fat Tony becomes Maggie's godfather."
Initial observations from the summary:
Potential errors in the data, based on the summary:
Error in number_in_series (Max 712713)
Error in number_in_season (Max 1920)
No blatant missing values, but some numeric values are listed as a different type
(us_viewers_in_millions is char type)
Refer to the README for other preliminary questions
# Show the rows with the largest number_in_series values first.
simpsons %>%
  arrange(desc(number_in_series)) %>%
  head() %>%
  rmarkdown::paged_table()
#DT::datatable(head(arrange(simpsons, desc(number_in_series))))
# Columns kept for inspecting the episode-numbering fields.
subsetDF <- c(
  "id", "title", "production_code", "directors", "writers",
  "number_in_season", "number_in_series"
)
simpsons_sub <- simpsons[, subsetDF]
number_in_series, which reveals there are
cases where records were wrong
subsetDF
## [1] "id" "title" "production_code" "directors"
## [5] "writers" "number_in_season" "number_in_series"
#simpsons[which.max(simpsons$number_in_season),]
#head(arrange(simpsons, desc(number_in_season)))
# Show the rows with the largest number_in_season values first.
simpsons_sub[c("id", "title", "number_in_season")] %>%
  arrange(desc(number_in_season)) %>%
  head() %>%
  rmarkdown::paged_table()
num_in_series
number_in_season is 1213
number_in_series use
id, however number_in_season adjusted
accordingly as two episodes
simpsons$viewers <- as.numeric(simpsons$us_viewers_in_millions)
## Warning: NAs introduced by coercion
which(is.na(as.numeric(simpsons$viewers)))
## [1] 160 161 173
simpsons[c(160, 161, 173),c("us_viewers_in_millions", "viewers")]
## # A tibble: 3 × 2
## us_viewers_in_millions viewers
## <chr> <dbl>
## 1 N/A NA
## 2 N/A NA
## 3 N/A NA
# Drop the rows whose viewer count could not be coerced to a number.
# Filtering on is.na() instead of hard-coded row positions keeps this correct
# if the data are re-sorted or re-downloaded.
simpsons_rm <- simpsons[!is.na(simpsons$viewers), ]
as.numeric() for further
application
is.na() didn’t detect them because the column was of class
char
(Match id to 607, 679, 709)
Note: The graphs are expected to fall under time series analysis, and using
id, original_air_date, or age is
somewhat interchangeable since they are effectively
factors (‘categorical’).
# Box plot of rating per season; one colour per season, legend hidden.
temp <- ggplot(
  data = tempSimps,
  mapping = aes(
    x = as.factor(season),
    y = rating,
    color = as.factor(season)
  )
) +
  geom_boxplot() +
  ggtitle("IMDB Rating vs Simpsons Seasons") +
  xlab("Season") +
  ylab("Rating") +
  theme(legend.position = "none")
# Render the static plot as an interactive plotly widget.
ggplotly(temp)
# viewers
# Box plot of viewers per season; one colour per season, legend hidden.
# The assignment target is tempViewers so that the ggplotly() call below
# refers to an object that exists.
tempViewers <- ggplot(
  data = tempSimps,
  mapping = aes(
    x = as.factor(season),
    y = as.numeric(viewers),
    color = as.factor(season)
  )
) +
  geom_boxplot() +
  ggtitle("Viewers vs Simpsons Seasons") +
  xlab("Season") +
  ylab("Viewers (p/ mil)") +
  theme(legend.position = "none")
# Render the static plot as an interactive plotly widget.
ggplotly(tempViewers)
# Rating per episode (by id), with points and a connecting line per season.
# geom_line() is added once; a second identical layer only duplicates the
# drawn lines (and the plotly traces).
tempPlot <- ggplot(
  data = tempSimps,
  mapping = aes(
    x = id,
    y = rating,
    col = as.factor(season),
    shape = as.factor(season)
  )
) +
  geom_point() +
  geom_line() +
  ylab("Rating") +
  xlab("Episode Number") +
  ggtitle("Rating vs Episode") +
  # Shape codes 0:14 repeated three times, giving 45 manual shape values.
  scale_shape_manual(values = rep(0:14, 3)) +
  theme(legend.position = "none")
# Render the static plot as an interactive plotly widget.
ggplotly(tempPlot)
#ggplot(data = simpsons, aes(x=id, y = rating)) + geom_point()
#ggplot(data = simpsons, aes(x=original_air_date, y = rating)) + geom_point()
# Viewers per episode (by id), with points and a connecting line per season.
# geom_line() is added once; a second identical layer only duplicates the
# drawn lines (and the plotly traces).
tempPlot <- ggplot(
  data = tempSimps,
  mapping = aes(
    x = id,
    y = viewers,
    col = as.factor(season),
    shape = as.factor(season)
  )
) +
  geom_point() +
  geom_line() +
  ylab("Viewer") +
  xlab("Episode Number") +
  # The title names what is plotted: viewers (y) against episode number (x).
  ggtitle("Viewers vs Episode") +
  # Shape codes 0:14 repeated three times, giving 45 manual shape values.
  scale_shape_manual(values = rep(0:14, 3)) +
  theme(legend.position = "none")
# Render the static plot as an interactive plotly widget.
ggplotly(tempPlot)
#ggplot(data = simpsons, aes(x=id, y = rating)) + geom_point()
#ggplot(data = simpsons, aes(x=original_air_date, y = rating)) + geom_point()
# Scatter of the us_viewers_in_millions column of old_simps against id,
# coloured by season. The outer parentheses print the plot as it is assigned.
(oldViewers <- ggplot(
  data = old_simps,
  mapping = aes(
    x = id,
    y = us_viewers_in_millions,
    col = as.factor(season)
  )
) +
  geom_point() +
  ylab("Viewers") +
  xlab("Episode Number") +
  ggtitle("Num Viewers vs Episode") +
  theme(legend.position = "none"))
us_viewers_in_millions values hard-coded as
N/A (char class)
# Make sure the viewers column is numeric before plotting it.
simpsons$viewers <- as.numeric(simpsons$viewers)
# Scatter of viewers against id, coloured by season, with the y axis limited
# to 0-30. The outer parentheses print the plot as it is assigned.
(refactoredViewer <- ggplot(
  data = simpsons,
  mapping = aes(x = id, y = viewers, col = as.factor(season))
) +
  geom_point() +
  ylab("Viewers") +
  xlab("Episode Number") +
  ggtitle("Num Viewers vs Episode") +
  scale_y_continuous(name = "Viewers (in mil)", limits = c(0, 30)))
## Warning: Removed 8 rows containing missing values (geom_point).
# Viewers per episode (by id), coloured by season.
ggplot(
  data = tempSimps,
  mapping = aes(x = id, y = viewers, col = as.factor(season))
) +
  geom_point()
#special_eps < simpsons %>% filter(simpsons, number_in_series = [4, 50, 20, 203])
# Episodes whose title contains "Treehouse of Horror".
THOH <- filter(simpsons, str_detect(title, "Treehouse of Horror"))
# Rating of every episode, with the THOH episodes drawn on top. Mapping the
# constant "" to colour inside aes() gives that layer its own colour.
ggplot(data = simpsons, mapping = aes(x = id, y = rating)) +
  geom_point() +
  geom_point(
    data = THOH,
    mapping = aes(x = id, y = rating, col = "")
  ) +
  ggtitle("Treehouse of Horror Specials")
# Same comparison for viewers instead of rating.
ggplot(data = simpsons, mapping = aes(x = id, y = as.numeric(viewers))) +
  geom_point() +
  geom_point(
    data = THOH,
    mapping = aes(x = id, y = as.numeric(viewers), col = "")
  ) +
  ggtitle("Treehouse of Horror Specials")
## Warning: Removed 3 rows containing missing values (geom_point).
- In general, Treehouse of Horror (THOH) episodes seem to be on the higher end of
  the ratings per season
- THOH 31 is the lowest, and is low even within its
  immediate range / season
length(unique(simpsons$writers))
## [1] 183
length(unique(simpsons$directors))
## [1] 57
mean(nchar(simpsons$writers))
## [1] 16.6069
# Rows whose writers string is longer than the mean writers-string length.
# The threshold is computed from the data rather than copied from printed
# output, so it stays correct if the data change. which() drops any NA
# comparisons instead of producing NA rows.
avgWriter <- simpsons[
  which(
    nchar(simpsons$writers) > mean(nchar(simpsons$writers), na.rm = TRUE)
  ),
]
# Bar counts per writers value, drawn in polar coordinates (theta mapped to y).
ggplot(simpsons, aes(x = writers, fill = writers)) +
  geom_bar() +
  coord_polar(theta = "y", start = 0)
https://stat.ethz.ch/R-manual/R-devel/library/base/html/nchar.html
## Data Analysis
#———————-
```